import re
import unicodedata

import regex

# Non-ASCII letters that "NFKD" normalization does not split into a base
# letter plus a combining mark, so they need an explicit ASCII replacement.
# Each replacement follows the case of its key (lowercase -> lowercase,
# uppercase -> uppercase).
ADDITIONAL_DIACRITICS = {
    'œ': 'oe',
    'Œ': 'OE',
    'ø': 'o',
    'Ø': 'O',
    'æ': 'ae',
    'Æ': 'AE',
    'ß': 'ss',
    'ẞ': 'SS',
    'đ': 'd',
    'Đ': 'D',
    'ð': 'd',
    'Ð': 'D',
    'þ': 'th',
    'Þ': 'TH',  # was 'th'; uppercase thorn must map to an uppercase replacement
    'ł': 'l',
    'Ł': 'L',
}


def remove_symbols_and_diacritics(s: str, keep=''):
    """
    Strip diacritics from `s` and blank out marks, symbols, and punctuation.

    The input is decomposed with "NFKD" normalization and each resulting
    character is handled by the first rule that applies:

    1. characters contained in `keep` are passed through unchanged
       (the check runs on the decomposed characters);
    2. characters listed in ADDITIONAL_DIACRITICS are replaced by their
       manual mapping;
    3. characters of category 'Mn' (the diacritics split off by NFKD) are dropped;
    4. any other character whose category starts with 'M', 'S', or 'P'
       (markers, symbols, punctuations) becomes a single space;
    5. everything else is kept as-is.
    """
    pieces = []
    for ch in unicodedata.normalize('NFKD', s):
        if ch in keep:
            pieces.append(ch)
            continue
        if ch in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[ch])
            continue

        category = unicodedata.category(ch)
        if category == 'Mn':
            # diacritic separated from its base letter: drop it entirely
            continue
        pieces.append(' ' if category[0] in 'MSP' else ch)
    return ''.join(pieces)


def remove_symbols(s: str):
    """
    Blank out markers, symbols, and punctuations while keeping diacritics.

    The input is normalized with "NFKC" and every resulting character whose
    category starts with 'M', 'S', or 'P' is replaced by a single space;
    all other characters are returned unchanged.
    """
    pieces = []
    for ch in unicodedata.normalize('NFKC', s):
        major_class = unicodedata.category(ch)[0]
        if major_class in 'MSP':
            pieces.append(' ')
        else:
            pieces.append(ch)
    return ''.join(pieces)


class BasicTextNormalizer:
    """
    Callable that lowercases text, removes bracketed and parenthesized
    spans, replaces symbols with spaces, and collapses whitespace.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        """
        remove_diacritics: when true, clean with remove_symbols_and_diacritics;
            otherwise clean with remove_symbols (diacritics are kept).
        split_letters: when true, put a space between every match of the
            `\\X` pattern (extended grapheme cluster in the `regex` module).
        """
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        """
        Return the normalized form of `s`.

        Whitespace runs are collapsed to a single space, but the result is
        not stripped, so one leading or trailing space may remain.
        """
        text = s.lower()

        # Drop spans opened by '<' or '[' and closed by the next '>' or ']'.
        text = re.sub(r'[<\[][^>\]]*[>\]]', '', text)
        # Drop non-empty parenthesized spans; an empty "()" is left for `clean`.
        text = re.sub(r'\(([^)]+?)\)', '', text)

        # Lowercase again because cleaning may change the characters.
        text = self.clean(text).lower()

        if self.split_letters:
            clusters = regex.findall(r'\X', text, regex.U)
            text = ' '.join(clusters)

        # replace any successive whitespace characters with a space
        return re.sub(r'\s+', ' ', text)
